library(tidyverse)
library(readxl) # for importing raw data
library(haven)

## Load data ####

# load WFH measure
wfh_occ_educ <- read_stata("Raw data/wfh_occ_educ.dta")

# Obtain fmli education data and merge into new wfh data set
fmli193_wfh <- read_excel("Raw data/fmli193.xlsx") %>% 
  select(NEWID, EDUC_REF, STATE, EARNCOMP, EDUCA2, FAM_TYPE, HIGH_EDU, 
         INCOMEY1, INCOMEY2, PSU, OCCUCOD1, OCCUCOD2, INTERI, FINCBTAX)
fmli194_wfh <- read_excel("Raw data/fmli194.xlsx") %>% 
  select(NEWID, EDUC_REF, STATE, EARNCOMP, EDUCA2, FAM_TYPE, HIGH_EDU, 
         INCOMEY1, INCOMEY2, PSU, OCCUCOD1, OCCUCOD2, INTERI, FINCBTAX)
fmli201_wfh <- read_excel("Raw data/fmli201.xlsx") %>% 
  select(NEWID, EDUC_REF, STATE, EARNCOMP, EDUCA2, FAM_TYPE, HIGH_EDU, 
         INCOMEY1, INCOMEY2, PSU, OCCUCOD1, OCCUCOD2, INTERI, FINCBTAX)
fmli202_wfh <- read_excel("Raw data/fmli202.xlsx") %>% 
  select(NEWID, EDUC_REF, STATE, EARNCOMP, EDUCA2, FAM_TYPE, HIGH_EDU, 
         INCOMEY1, INCOMEY2, PSU, OCCUCOD1, OCCUCOD2, INTERI, FINCBTAX)
fmli203_wfh <- read_excel("Raw data/fmli203.xlsx") %>% 
  select(NEWID, EDUC_REF, STATE, EARNCOMP, EDUCA2, FAM_TYPE, HIGH_EDU, 
         INCOMEY1, INCOMEY2, PSU, OCCUCOD1, OCCUCOD2, INTERI, FINCBTAX)
fmli204_wfh <- read_excel("Raw data/fmli204.xlsx") %>% 
  select(NEWID, EDUC_REF, STATE, EARNCOMP, EDUCA2, FAM_TYPE, HIGH_EDU, 
         INCOMEY1, INCOMEY2, PSU, OCCUCOD1, OCCUCOD2, INTERI, FINCBTAX)
fmli211_wfh <- read_excel("Raw data/fmli211.xlsx") %>% 
  select(NEWID, EDUC_REF, STATE, EARNCOMP, EDUCA2, FAM_TYPE, HIGH_EDU, 
         INCOMEY1, INCOMEY2, PSU, OCCUCOD1, OCCUCOD2, INTERI, FINCBTAX)
fmli212_wfh <- read_excel("Raw data/fmli212.xlsx") %>% 
  select(NEWID, EDUC_REF, STATE, EARNCOMP, EDUCA2, FAM_TYPE, HIGH_EDU, 
         INCOMEY1, INCOMEY2, PSU, OCCUCOD1, OCCUCOD2, INTERI, FINCBTAX)

fmli_wfh <- rbind(fmli193_wfh, fmli194_wfh, fmli201_wfh, fmli202_wfh,
                  fmli203_wfh, fmli204_wfh, fmli211_wfh, fmli212_wfh)

fmli_wfh_inc <- rbind(fmli193_wfh, fmli194_wfh, fmli201_wfh)

# Load member files
memi193_wfh <- read_excel("Raw data/memi193.xlsx") %>% rename_all(toupper) %>% 
  select(NEWID, EARNER, EARNTYPE, EDUCA, IN_COLL, INC_HRSQ, INCOMEY, WKSTATUS,
         OCCUCODE, CU_CODE, SALARYB, SALARYBX, SALARYX, SALARYXI, AGE, INCNONWK)
memi194_wfh <- read_excel("Raw data/memi194.xlsx") %>% rename_all(toupper) %>% 
  select(NEWID, EARNER, EARNTYPE, EDUCA, IN_COLL, INC_HRSQ, INCOMEY, WKSTATUS,
         OCCUCODE, CU_CODE, SALARYB, SALARYBX, SALARYX, SALARYXI, AGE, INCNONWK)
memi201_wfh <- read_excel("Raw data/memi201.xlsx") %>% rename_all(toupper) %>% 
  select(NEWID, EARNER, EARNTYPE, EDUCA, IN_COLL, INC_HRSQ, INCOMEY, WKSTATUS,
         OCCUCODE, CU_CODE, SALARYB, SALARYBX, SALARYX, SALARYXI, AGE, INCNONWK)
memi202_wfh <- read_excel("Raw data/memi202.xlsx") %>% rename_all(toupper) %>% 
  select(NEWID, EARNER, EARNTYPE, EDUCA, IN_COLL, INC_HRSQ, INCOMEY, WKSTATUS,
         OCCUCODE, CU_CODE, SALARYB, SALARYBX, SALARYX, SALARYXI, AGE, INCNONWK)
memi203_wfh <- read_excel("Raw data/memi203.xlsx") %>% rename_all(toupper) %>% 
  select(NEWID, EARNER, EARNTYPE, EDUCA, IN_COLL, INC_HRSQ, INCOMEY, WKSTATUS,
         OCCUCODE, CU_CODE, SALARYB, SALARYBX, SALARYX, SALARYXI, AGE, INCNONWK)
memi204_wfh <- read_excel("Raw data/memi204.xlsx") %>% rename_all(toupper) %>% 
  select(NEWID, EARNER, EARNTYPE, EDUCA, IN_COLL, INC_HRSQ, INCOMEY, WKSTATUS,
         OCCUCODE, CU_CODE, SALARYB, SALARYBX, SALARYX, SALARYXI, AGE, INCNONWK)
memi211_wfh <- read_excel("Raw data/memi211.xlsx") %>% rename_all(toupper) %>% 
  select(NEWID, EARNER, EARNTYPE, EDUCA, IN_COLL, INC_HRSQ, INCOMEY, WKSTATUS,
         OCCUCODE, CU_CODE, SALARYB, SALARYBX, SALARYX, SALARYXI, AGE, INCNONWK)
memi212_wfh <- read_excel("Raw data/memi212.xlsx") %>% rename_all(toupper) %>% 
  select(NEWID, EARNER, EARNTYPE, EDUCA, IN_COLL, INC_HRSQ, INCOMEY, WKSTATUS,
         OCCUCODE, CU_CODE, SALARYB, SALARYBX, SALARYX, SALARYXI, AGE, INCNONWK)

memi_wfh_inc <- rbind(memi193_wfh, memi194_wfh, memi201_wfh)
memi_wfh <- rbind(memi193_wfh, memi194_wfh, memi201_wfh, memi202_wfh,
                  memi203_wfh, memi204_wfh, memi211_wfh, memi212_wfh)

## Income-weighted measure ####
memi_wfh_inc <- memi_wfh_inc %>% 
  filter(CU_CODE %in% c(1,0,2)) %>% # keep only spouses/partners and reference person
  rowwise() %>% mutate(educ_ce_cps = ifelse(EDUCA==1,1,EDUCA-1),
                       ind_ce_cps = OCCUCODE) %>% 
  mutate(pernum = ifelse(CU_CODE==1,"1","2")) %>% # reference person is person 1, spouse/partner is 2
  pivot_wider(
    names_from = pernum,
    values_from = c(EARNER, EARNTYPE, EDUCA, IN_COLL, INC_HRSQ, INCOMEY, OCCUCODE, CU_CODE, WKSTATUS,
                    SALARYB, SALARYBX, SALARYX, SALARYXI, AGE, INCNONWK, educ_ce_cps, ind_ce_cps))


wfh_inc <- fmli_wfh_inc %>% full_join(memi_wfh_inc, by = "NEWID") %>% 
  mutate(ID = floor(NEWID/10)) %>% 
  arrange(ID,INTERI) %>% 
  group_by(ID) %>% 
  filter(row_number()==1) %>% 
  ungroup()

# sanity checks, make sure data is complete. min, mean & max of check should be 1
wfh_inc %>% mutate(check = (is.na(SALARYBX_1) & !is.na(SALARYX_1)) |
                     (!is.na(SALARYBX_1) & is.na(SALARYX_1)) | 
                     (is.na(SALARYBX_1) & is.na(SALARYX_1))) %>% 
  summarise(check_min = min(check), check_mean = mean(check), check_max = max(check))
wfh_inc %>% mutate(check = (is.na(SALARYBX_2) & !is.na(SALARYX_2)) |
                     (!is.na(SALARYBX_2) & is.na(SALARYX_2)) | 
                     (is.na(SALARYBX_2) & is.na(SALARYX_2))) %>% 
  summarise(check_min = min(check), check_mean = mean(check), check_max = max(check))




# construct measures
wfh_inc <- wfh_inc %>% 
  left_join(wfh_occ_educ %>% 
              rename(ind_ce_cps_1 = ind_ce_cps, 
                     educ_ce_cps_1 = educ_ce_cps,
                     wfh_prop_1 = wfh_prop) %>%
              select(-count),
            by = c("ind_ce_cps_1","educ_ce_cps_1")) %>% 
  left_join(wfh_occ_educ %>% 
              rename(ind_ce_cps_2 = ind_ce_cps, 
                     educ_ce_cps_2 = educ_ce_cps,
                     wfh_prop_2 = wfh_prop) %>%
              select(-count),
            by = c("ind_ce_cps_2","educ_ce_cps_2")) %>% 
  rowwise() %>% 
  mutate(salary1 = ifelse(is.na(SALARYBX_1) & is.na(SALARYX_1), NA, sum(SALARYBX_1, SALARYX_1, na.rm=TRUE)), # take either salarybx or salaryx for each
         salary2 = ifelse(is.na(SALARYBX_2) & is.na(SALARYX_2), NA, sum(SALARYBX_2, SALARYX_2, na.rm=TRUE))) %>% 
  mutate(salary1 = ifelse(is.na(salary1) & !is.na(salary2), 0, salary1), # if one partner has NA salary and the other has positive salary, make NA into 0
         salary2 = ifelse(is.na(salary2) & !is.na(salary1), 0, salary2)) %>%
  mutate(wfh_prop_1 = ifelse(!is.na(salary1) & salary1==0,-1,wfh_prop_1), # if salary is going to be zero, doesnt matter what wfh is. this avoids missings.
         wfh_prop_2 = ifelse(!is.na(salary2) & salary2==0,-1,wfh_prop_2)) %>%
  mutate(sh_inc_retain_total = (FINCBTAX - ((1-wfh_prop_1)*salary1 + (1-wfh_prop_2)*salary2))/FINCBTAX, # construct main measures
         sh_inc_retain_salary = (wfh_prop_1*salary1 + wfh_prop_2*salary2)/(salary1 + salary2)) %>%
  mutate(sh_inc_retain_total = ifelse(is.na(sh_inc_retain_total) & (INCNONWK_1==1 | INCNONWK_1==4) & (INCNONWK_2==1 | INCNONWK_2==4), 1, sh_inc_retain_total), # if both are retired or unable to work, retention is 100%
         sh_inc_retain_salary = ifelse(is.na(sh_inc_retain_salary) & (INCNONWK_1==1 | INCNONWK_1==4) & (INCNONWK_2==1 | INCNONWK_2==4), 1, sh_inc_retain_salary)) %>%
  mutate(sh_inc_retain_total = ifelse(!is.na(INCNONWK_2) & !is.na(WKSTATUS_1) & !is.na(wfh_prop_1), (FINCBTAX - (1-wfh_prop_1)*salary1)/FINCBTAX, sh_inc_retain_total), # if person 2 unemployed, person 1 employed then take person 1's wfh
         sh_inc_retain_salary = ifelse(!is.na(INCNONWK_2) & !is.na(WKSTATUS_1) & !is.na(wfh_prop_1), wfh_prop_1, sh_inc_retain_salary),
         sh_inc_retain_total = ifelse(!is.na(INCNONWK_1) & !is.na(WKSTATUS_2) & !is.na(wfh_prop_2), (FINCBTAX - (1-wfh_prop_2)*salary2)/FINCBTAX, sh_inc_retain_total),
         sh_inc_retain_salary = ifelse(!is.na(INCNONWK_1) & !is.na(WKSTATUS_2) & !is.na(wfh_prop_2), wfh_prop_2, sh_inc_retain_salary)) %>%
  mutate(sh_inc_retain_total = ifelse(is.na(WKSTATUS_2) & is.na(INCNONWK_2) & !is.na(wfh_prop_1) & !is.na(WKSTATUS_1),(FINCBTAX - (1-wfh_prop_1)*salary1)/FINCBTAX,sh_inc_retain_total),
         sh_inc_retain_salary = ifelse(is.na(WKSTATUS_2) & is.na(INCNONWK_2) & !is.na(wfh_prop_1) & !is.na(WKSTATUS_1),wfh_prop_1,sh_inc_retain_salary)) %>% 
  mutate(sh_inc_retain_total = ifelse(is.na(EARNER_2) & is.na(WKSTATUS_2) & is.na(INCNONWK_2) & !is.na(INCNONWK_1) & (INCNONWK_1==1 | INCNONWK_1==4),1,sh_inc_retain_total),
         sh_inc_retain_salary = ifelse(is.na(EARNER_2) & is.na(WKSTATUS_2) & is.na(INCNONWK_2) & !is.na(INCNONWK_1) & (INCNONWK_1==1 | INCNONWK_1==4),1,sh_inc_retain_salary)) %>% 
  ungroup() %>% 
  arrange(ID) %>% 
  select(ID, sh_inc_retain_total, sh_inc_retain_salary)


wfh_inc <- wfh_inc %>% 
  filter(!(is.na(sh_inc_retain_total) & is.na(sh_inc_retain_salary)))

write.csv(wfh_inc,"wfh_inc.csv", row.names = FALSE)


## Industry-based measure ####
memi_wfh_ind <- memi_wfh %>% 
  select(-c(SALARYB, SALARYBX, SALARYX, SALARYXI)) %>% 
  filter(CU_CODE %in% c(1,0,2)) %>% # keep only spouses/partners and reference person
  rowwise() %>% mutate(educ_ce_cps = ifelse(EDUCA==1,1,EDUCA-1),
                       ind_ce_cps = OCCUCODE) %>% 
  mutate(pernum = ifelse(CU_CODE==1,"1","2")) %>% # reference person is person 1, spouse/partner is 2
  pivot_wider(
    names_from = pernum,
    values_from = c(EARNER, EARNTYPE, EDUCA, IN_COLL, INC_HRSQ, INCOMEY, OCCUCODE, CU_CODE, WKSTATUS,
                    AGE, INCNONWK, educ_ce_cps, ind_ce_cps))


wfh_ind <- fmli_wfh %>% full_join(memi_wfh_ind, by = "NEWID") %>% 
  mutate(ID = floor(NEWID/10)) %>% 
  arrange(ID,INTERI) %>% 
  group_by(ID) %>% 
  filter(row_number()==1) %>% 
  ungroup()

# construct measures
wfh_ind <- wfh_ind %>% 
  left_join(wfh_occ_educ %>% 
              rename(ind_ce_cps_1 = ind_ce_cps, 
                     educ_ce_cps_1 = educ_ce_cps,
                     wfh_prop_1 = wfh_prop) %>%
              select(-count),
            by = c("ind_ce_cps_1","educ_ce_cps_1")) %>% 
  left_join(wfh_occ_educ %>% 
              rename(ind_ce_cps_2 = ind_ce_cps, 
                     educ_ce_cps_2 = educ_ce_cps,
                     wfh_prop_2 = wfh_prop) %>%
              select(-count),
            by = c("ind_ce_cps_2","educ_ce_cps_2")) %>% 
  ungroup() %>% 
  mutate(wfh = case_when(
    !is.na(WKSTATUS_1) & !is.na(WKSTATUS_2) & !is.na(wfh_prop_1) & !is.na(wfh_prop_2) ~ (wfh_prop_1+wfh_prop_2)/2,
    !is.na(INCNONWK_2) & !is.na(WKSTATUS_1) & !is.na(wfh_prop_1) ~ wfh_prop_1,
    !is.na(INCNONWK_1) & !is.na(WKSTATUS_2) & !is.na(wfh_prop_2) ~ wfh_prop_2,
    (INCNONWK_1==1 | INCNONWK_1==4) & (INCNONWK_2==1 | INCNONWK_2==4) ~ 1,
    is.na(EARNER_2) & is.na(WKSTATUS_2) & is.na(INCNONWK_2) & !is.na(wfh_prop_1) & !is.na(WKSTATUS_1) ~  wfh_prop_1,
    is.na(EARNER_2) & is.na(WKSTATUS_2) & is.na(INCNONWK_2) & (INCNONWK_1==1 | INCNONWK_1==4) ~  1)) %>% 
  arrange(ID) %>% 
  select(ID, wfh) %>% 
  filter(!is.na(wfh))

write.csv(wfh_ind,"wfh_ind.csv", row.names = FALSE)
